# Load the raw car data; rows with a missing price are the cars whose
# price we predict at the end (see Task 4).
cars <- read.csv("CarPriceData.csv")

Task 0

The goal of this project is to use multiple regression to predict car prices. We are going to clean, engineer and remove data from the given cars data found here. We will then go through the remaining data and create visuals and models. Based on the visuals, more data will be removed and a model will be created from the remaining data that helps create a best fit. The model will then be used to predict 20 unknown prices from the data. After the predictions are made, they will be submitted in a competition against other models to see which model is the best.

Task 1

# Root-mean-square error between predictions and observations.
# Pairs containing NA are ignored (na.rm = TRUE).
RMSE <- function(predict, obs) {
  squared_errors <- (predict - obs)^2
  sqrt(mean(squared_errors, na.rm = TRUE))
}

# Normalize car manufacturer names.
#
# Lowercases the input and corrects the misspellings/aliases that appear in
# the raw CarName column; any other value passes through unchanged.
#
# @param make Character vector of manufacturer names.
# @return Character vector of the same length with corrected, lowercase names.
fixMake <- function(make) {
  make <- tolower(make)

  # Lookup table: misspelling/alias -> corrected make. This replaces the
  # original switch() loop, which grew the result with c() on every
  # iteration (an O(n^2) append pattern).
  corrections <- c(
    "alfa-romero" = "alfa-romeo",
    "maxda"       = "mazda",
    "porcshce"    = "porsche",
    "toyouta"     = "toyota",
    "vokswagen"   = "volkswagen",
    "vw"          = "volkswagen"
  )

  needs_fix <- make %in% names(corrections)
  make[needs_fix] <- corrections[make[needs_fix]]
  unname(make)
}

# Augment `data` with interval columns from a fitted lm.
#
# Adds five columns: fit, confLwr, confUpr (95% confidence interval for the
# mean response) and predictLwr, predictUpr (95% prediction interval for a
# new observation), all on the model's response scale.
#
# @param data Data frame of predictor values (rows align with the output).
# @param model A fitted lm object.
# @return `data` with the five interval columns appended on the right.
buildIntervals <- function(data, model) {
  conf <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence"))
  names(conf) <- c("fit", "confLwr", "confUpr")

  pred <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction"))
  pred <- data.frame(predictLwr = pred$lwr, predictUpr = pred$upr)

  cbind(data, conf, pred)
}

Removing Variables

# Columns judged redundant or uninformative for predicting price;
# the reasons for each are documented in the bullet list that follows.
droppedCols <- c(
  "X", "wheelbase", "enginetype", "fuelsystem", "symboling", "stroke",
  "peakrpm", "boreratio", "doornumber", "cylindernumber", "enginelocation"
)

# Keep only the priced cars (the NA-price rows are predicted later) and
# drop the unwanted columns.
cars2 <- cars %>%
  filter(!is.na(price)) %>%
  select(-all_of(droppedCols))
  • enginetype: This can be explained by enginesize.
  • wheelbase: The data varies greatly and we choose not to use it to predict price.
  • fuelsystem: There are factors that vary little to one data point.
  • symboling: The price is not set by this.
  • stroke: This can be explained by enginesize.
  • peakrpm: This can be explained by horsepower.
  • cylindernumber: This can be explained by enginesize.
  • boreratio: This can be explained by enginesize.
  • doornumber: This is explained by carbody.
  • enginelocation: There are only three cars in the data set with rear engines, and they have other similar traits.
  • X: Same as car_ID.

Data Engineering

Engineering

# Engineer modeling features from the cleaned data.
# NOTE(review): separate() keeps only the first two whitespace-delimited
# pieces of CarName and discards the rest (with a warning for multi-word
# model names) — confirm that losing the extra words is acceptable.
cars3 <- cars2 %>%
  separate(CarName, into = c("make", "model"), sep = " ") %>%
  mutate(
    make = fixMake(make),                               # normalize misspelled makes
    projectedVolume = carwidth * carlength * carheight, # bounding-box volume proxy
    avgMPG = (citympg + highwaympg) / 2,                # single fuel-economy measure
    logHP = log10(horsepower),                          # model horsepower on log10 scale
    logPrice = log10(price)                             # response is modeled on log10 scale
  )
  • make: The manufacturer of the vehicle, created by separating CarName. (Categorical)
  • model: The model of the vehicle, created by separating CarName. (Categorical)
  • projectedVolume: The projected volume of the car, created by \(carwidth \cdot carlength \cdot carheight\). (Numeric)
  • avgMPG: The average of citympg and highwaympg. (Numeric)
  • logHP: The base-10 logarithm of the horsepower. (Numeric)
  • logPrice: The base-10 logarithm of the price. (Numeric)

Cleaning

# Drop the raw columns that the engineered features now summarize, then
# encode the categorical predictors as factors for modeling.
cars4 <- cars3 %>%
  select(!c(
    model,
    citympg,
    highwaympg,
    carwidth,
    carlength,
    carheight,
    horsepower
  )) %>%
  mutate(across(c(make, carbody, fueltype), as.factor))
  • CarName: Split into make and model.
  • model: This is not important information.
  • citympg: citympg is now explained in avgMPG.
  • highwaympg: highwaympg is now explained in avgMPG.
  • carwidth: carwidth is now explained in projectedVolume.
  • carlength: carlength is now explained in projectedVolume.
  • carheight: carheight is now explained in projectedVolume.
  • horsepower: horsepower is now explained in logHP.
  • cylindernumber: cylindernumber was already removed in the initial variable selection (it is explained by enginesize).

Visualization

Figure 1: Size of Car vs Price

# Log-linear model: log10(price) as a function of engine size.
model1 <- lm(logPrice ~ enginesize, data = cars4)

# Attach fit, confidence- and prediction-interval columns to the data.
df1 <- buildIntervals(cars4, model1)

# Points are raw prices; the fitted line and both interval bands are
# back-transformed from the log10 scale with 10^. Yellow ribbon + black
# dashes = 95% confidence band; red dashes = 95% prediction band.
# NOTE(review): the `size` aesthetic for lines is superseded by `linewidth`
# in ggplot2 >= 3.4 — confirm the installed version before changing.
ggplot(df1, aes(x = enginesize)) +
  geom_point(aes(y = price), alpha = 0.5) +
  geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
  geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
  geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
  geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
  labs(
    title = "Figure 1: Size of Car vs Price",
    x = bquote("Size of Car" ~ ("in"^3)),
    y = "Price ($)",
  )

# Rounded slope/intercept of model1 (on the log10-price scale) for the text.
slope1 <- round(summary(model1)$coefficients[2, 1], 3)
intercept1 <- round(summary(model1)$coefficients[1, 1], 3)

Figure 1 shows that the price of the car can be explained by the car's engine size (enginesize) using the formula \(price = 10^{0.004 \cdot enginesize} \cdot 3250.8729739\).

Figure 2: Car Weight vs Price

# Log-linear model: log10(price) as a function of curb weight.
model2 <- lm(logPrice ~ curbweight, data = cars4)

df2 <- buildIntervals(cars4, model2)

# Same layout as Figure 1: raw prices with back-transformed (10^) fit,
# confidence band (yellow/black dashed) and prediction band (red dashed).
ggplot(df2, aes(x = curbweight)) +
  geom_point(aes(y = price), alpha = 0.5) +
  geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
  geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
  geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
  geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
  labs(
    title = "Figure 2: Car Weight vs Price",
    x = "Car Weight (lbs)",
    y = "Price ($)",
  )

# Rounded slope/intercept of model2 (log10-price scale) for the text.
slope2 <- round(summary(model2)$coefficients[2, 1], 4)
intercept2 <- round(summary(model2)$coefficients[1, 1], 4)

Figure 2 shows that the price of the car can also be explained by the weight of the car (curbweight) using the formula \(price = 10^{0.0004 \cdot curbweight} \cdot 1250.2590302\).

Figure 3: Engine Size vs Weight of Car

# Collinearity check: model curbweight as a function of enginesize.
# (No log transform here, so intervals are plotted on the raw scale.)
model3 <- lm(curbweight ~ enginesize, data = cars4)

df3 <- buildIntervals(cars4, model3)

# x is engine size (the predictor), y is curb weight (the response).
ggplot(df3, aes(x = enginesize)) +
  geom_point(aes(y = curbweight), alpha = .75) +
  geom_smooth(aes(y = curbweight), method = lm, fill = "yellow") +
  geom_line(aes(y = confLwr), linetype = "dashed") +
  geom_line(aes(y = confUpr), linetype = "dashed") +
  geom_line(aes(y = predictLwr), linetype = "dashed", color = "red") +
  geom_line(aes(y = predictUpr), linetype = "dashed", color = "red") +
  labs(
    title = "Figure 3: Engine Size vs Weight of Car",
    # BUG FIX: the axis labels were swapped — the x aesthetic is enginesize
    # and the y aesthetic is curbweight.
    x = "Engine Size (CID)",
    y = "Weight of Car (lbs)",
  )

# Rounded slope/intercept of model3 (curbweight per unit enginesize).
slope3 <- round(summary(model3)$coefficients[2, 1], 3)
intercept3 <- round(summary(model3)$coefficients[1, 1], 3)

Figure 3 shows that the weight of the car (curbweight) can be explained by the engine size (enginesize) using the formula \(curbweight = 10.476 \cdot enginesize + 1221.953\). This means that curbweight and enginesize are collinear and only one should be used in the final model. Since price is better explained by curbweight (compare Figures 1 and 2), enginesize will be thrown out of the data later on in favor of curbweight.

Figure 4: Log Horsepower vs Price

# Log-log model: log10(price) as a function of log10(horsepower).
model4 <- lm(logPrice ~ logHP, data = cars4)

df4 <- buildIntervals(cars4, model4)

# Raw prices with the back-transformed (10^) fit and interval bands.
ggplot(df4, aes(x = logHP)) +
  geom_point(aes(y = price), alpha = 0.5) +
  geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
  geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
  geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
  geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
  geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
  labs(
    title = "Figure 4: Horsepower vs Price",
    x = "Log Horsepower",
    y = "Price ($)",
  )

# BUG FIX: these previously read the coefficients from model3 (the
# curbweight ~ enginesize fit of Figure 3) instead of model4.
slope4 <- round(summary(model4)$coefficients[2, 1], 3)
intercept4 <- round(summary(model4)$coefficients[1, 1], 3)

Figure 4 shows that the price of the car can also be explained by the log of horsepower (logHP). Because both price and horsepower enter the model on the log scale, the fit takes the power-law form \(price = 10^{intercept_4} \cdot horsepower^{slope_4}\), with the slope and intercept computed above.

Figure 5: Car Make vs Average Price

# Average price per make, with makes ordered by their mean price.
# CONSISTENCY FIX: plot from cars4 directly (as Figures 6 and 7 do) rather
# than df4 — df4 is cars4 plus interval columns that are irrelevant here,
# so the output is identical.
ggplot(cars4, aes(x = fct_reorder(make, price, .fun = mean), y = price)) +
  geom_bar(stat = "summary", fun = "mean", fill = "#00bfc4") +
  labs(
    title = "Figure 5: Car Make vs Average Price",
    x = "Car Make",
    y = "Average price",
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Figure 5 shows that the average price varies greatly based on the make of the car. This means that make should be considered in our final model.

Figure 6: Car Body vs Average Price

# Average price per body style, ordered by mean price; rotated x labels
# keep the category names readable.
ggplot(cars4, aes(x = fct_reorder(carbody, price, .fun = mean), y = price)) +
  geom_bar(stat = "summary", fun = "mean", fill = "#c77cff") +
  labs(
    title = "Figure 6: Car Body vs Average Price",
    x = "Car Body",
    y = "Average Price ($)",
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Figure 6 shows that the average price varies based on the body of the car (carbody). This means that carbody could be considered in our final model.

Figure 7: Fuel Type vs Average Price

# Average price per fuel type, ordered by mean price; one bar color per
# fuel type, legend suppressed since the x axis already labels them.
ggplot(cars4, aes(x = fct_reorder(fueltype, price, .fun = mean), y = price, fill = fueltype)) +
  geom_bar(stat = "summary", fun = "mean", show.legend = FALSE) +
  labs(
    title = "Figure 7: Fuel Type vs Price",
    x = "Fuel Type",
    y = "Average Price ($)",
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Figure 7 shows that the average price varies based on the fuel type of the car (fueltype). This means that fueltype could be considered in our final model.

Task 3

Building Model

# Drop enginesize: it is collinear with curbweight (see Figure 3).
cars5 <- cars4 %>%
  select(-enginesize)

# Fit the full model on everything except the identifier and the raw
# response, then run an exhaustive best-subset search (olsrr package) to
# find the best model at each subset size.
testModel <- lm(logPrice ~ . - car_ID - price, data = cars5)
bestSelect <- ols_step_best_subset(testModel)

To start off, we dropped the enginesize data because of our observation in Figure 3. After dropping enginesize, we can create a model that includes all the data we did not remove; this model excludes car_ID and price. Now ols_step_best_subset can be run on the model: it creates subset models of all our variables and outputs the best model for each number of variables in the formula. Looking at the result, we determined that the best number of variables is five; the variables are listed below.

Model

# Final model: the five predictors chosen by the best-subset search.
finalModel <- lm(logPrice ~ curbweight + logHP + make + carbody + fueltype, data = cars5)
finalSummary <- summary(finalModel)

# Rounded coefficient estimates (on the log10-price scale) for the write-up.
# NOTE(review): this shadows stats::coefficients(); later chunks index this
# vector by position, so the name is deliberately left unchanged.
coefficients <- round(finalSummary$coefficients[, 1], 4)

# Total number of estimated coefficients (intercept + slopes + dummies).
numCoe <- length(coefficients)

After finding these variables we can create a model for them. Using the newly created model we can find the following function:

\[ price = 2.819032^{\,logHP} \cdot 10^{0.0002 \cdot {curbweight}} \\~\\ \cdot 10^{0.0839 \cdot 1_{make = Audi}} \cdot 10^{0.1768 \cdot 1_{make = BMW}} \\~\\ \cdot 10^{0.0974 \cdot 1_{make = Buick}} \cdot 10^{-0.0351 \cdot 1_{make = Chevrolet}} \\~\\ \cdot 10^{-0.0658 \cdot 1_{make = Dodge}} \cdot 10^{-0.0295 \cdot 1_{make = Honda}} \\~\\ \cdot 10^{-0.0121 \cdot 1_{make = Isuzu}} \cdot 10^{0.0146 \cdot 1_{make = Jaguar}} \\~\\ \cdot 10^{0.0084 \cdot 1_{make = Mazda}} \cdot 10^{-0.0291 \cdot 1_{make = Mercury}} \\~\\ \cdot 10^{-0.0924 \cdot 1_{make = Mitsubishi}} \cdot 10^{-0.0427 \cdot 1_{make = Nissan}} \\~\\ \cdot 10^{-0.0465 \cdot 1_{make = Peugeot}} \cdot 10^{-0.0773 \cdot 1_{make = Plymouth}} \\~\\ \cdot 10^{0.1902 \cdot 1_{make = Porsche}} \cdot 10^{-0.0719 \cdot 1_{make = Renault}} \\~\\ \cdot 10^{0.0246 \cdot 1_{make = Saab}} \cdot 10^{-0.0761 \cdot 1_{make = Subaru}} \\~\\ \cdot 10^{-0.0544 \cdot 1_{make = Toyota}} \cdot 10^{-0.0141 \cdot 1_{make = Volkswagen}} \\~\\ \cdot 10^{0.0238 \cdot 1_{make = Volvo}} \cdot 10^{-0.0676 \cdot 1_{carbody = hardtop}} \\~\\ \cdot 10^{-0.0978 \cdot 1_{carbody = hatchback}} \cdot 10^{-0.0783 \cdot 1_{carbody = sedan}} \\~\\ \cdot 10^{-0.106 \cdot 1_{carbody = wagon}} \cdot 10^{-0.052 \cdot 1_{fueltype = gas}} \\~\\ \cdot 10^{2.7858}\]

Summary

# Quantities quoted in the interpretation bullets below.
# NOTE(review): these use `cars` (all rows, including the unpriced ones),
# not the modeling data cars5 — presumably intentional; confirm.
minWeight <- min(cars$curbweight)
maxWeight <- max(cars$curbweight)
# Price multiplier for an example 2250 lb car: (10^b_curbweight)^2250.
weightExample <- round((10^coefficients[2])^2250, 4)
meanIncrease <- round(mean((10^coefficients[2])^cars$curbweight), 4)

# Smallest/largest price multiplier across the make dummy coefficients.
# NOTE(review): the index ranges 4:23 and 24:28 are hard-coded — verify
# them against names(coefficients); if carbody contributes only 4 dummies,
# 24:28 would mix the carbody and fueltype coefficients.
minMake <- round(10^min(coefficients[4:23]), 4)
maxMake <- round(10^max(coefficients[4:23]), 4)

minBody <- round(10^min(coefficients[24:28]), 4)
maxBody <- round(10^max(coefficients[24:28]), 4)

Let’s take a deeper look at how each variable affects the model.

  • horsepower: The price is multiplied by \(2.819032^{logHP}\); equivalently, every time the horsepower is multiplied by \(10\), the price is multiplied by \(2.819032\).
  • curbweight: The price of the car is multiplied by \(1.0005^{curbweight}\). For example, if you have a car that weighs \(2250\) lbs then the price is multiplied by \(2.8184\); because car weights range from \(1488\) to \(4066\), the average amount the price is multiplied by is \(3.3435\). This is important because curbweight roughly triples the price on average.
  • make: The price increases or decreases based on the make. It does this by multiplying the price by \(0.8084\) at the lowest and \(1.5495\) at the highest. make is important because it can greatly lower the price or slightly increase the price.
  • carbody: Multiplies price by \(0.7834\) at the lowest and \(1.0563\) at the highest.
  • fueltype: If the fueltype is gas, then the price is multiplied by \(0.8872\).
  • Now that we understand the model a little more, let's take a look at how well our model fits the data.

    # In-sample fit quality on the original dollar scale: back-transform the
    # fitted log10 prices with 10^ before comparing to the observed prices.
    corCoff <- round(cor(cars5$price, 10^finalModel$fitted.values), 4)
    carRMSE <- round(RMSE(10^finalModel$fitted.values, cars5$price), 4)
    • Correlation Coefficient: 0.9722
    • RMSE: 1916.0413

    The correlation coefficient and RMSE look pretty good, so let's take a look at the Residuals vs Fitted plot of the model and the Q-Q plot of the model.

    # Diagnostic plots 1 and 2: Residuals vs Fitted and Normal Q-Q (ggfortify).
    autoplot(finalModel,1:2)

    Looking at the Residuals vs Fitted plot above, the data looks pretty random, which is good, but the Q-Q plot shows that while the model is pretty good near the mean price, it is not great at estimating very high or very low prices.

Task 4

Now that we have created a model, we can use it to predict the prices for the cars without values.

# Prepare the cars with unknown prices for prediction, applying the same
# drop/engineer/clean steps that produced cars4.
# NOTE(review): this repeats the cars2/cars3/cars4 pipeline inline; consider
# extracting a shared helper so the two pipelines cannot drift apart.
# NOTE(review): unlike cars4, make/carbody/fueltype are NOT converted to
# factors here; predict() maps the character values onto the model's factor
# levels, but confirm no unseen levels appear in this subset.
# logPrice is log10(NA) = NA for these rows and is never used.
cars6 <- cars %>%
  filter(is.na(price)) %>%
  select(
    -X,
    -wheelbase,
    -enginetype,
    -fuelsystem,
    -symboling,
    -stroke,
    -peakrpm,
    -boreratio,
    -doornumber,
    -cylindernumber,
    -enginelocation
  ) %>%
  separate(CarName, into = c("make", "model"), sep = " ") %>%
  mutate(
    make = fixMake(make),
    projectedVolume = carwidth * carlength * carheight,
    avgMPG = (citympg + highwaympg) / 2,
    logHP = log10(horsepower),
    logPrice = log10(price)
  ) %>%
  select(
    -model,
    -citympg,
    -highwaympg,
    -carwidth,
    -carlength,
    -carheight,
    -horsepower
  )

# Predicted prices for the unpriced cars, back-transformed to dollars,
# together with the 95% prediction-interval bounds.
cars7 <- buildIntervals(cars6, finalModel) %>%
  transmute(
    car_ID,
    price = 10^fit,
    lower = 10^predictLwr,
    upper = 10^predictUpr
  )

Predictions

# Render the prediction table (knitr).
kable(cars7)
car_ID price lower upper
14 21925.112 16814.779 28588.575
28 8838.829 6767.418 11544.271
62 10072.907 7788.371 13027.558
70 30075.819 22600.031 40024.499
79 6058.387 4662.255 7872.596
80 7775.771 5985.254 10101.928
90 6771.282 5222.411 8779.518
98 6816.032 5226.726 8888.604
101 9606.346 7420.542 12436.002
103 17699.608 13547.104 23124.952
111 17060.152 13035.431 22327.514
116 13754.299 10548.827 17933.818
126 23450.790 17532.957 31366.047
132 8951.273 6257.751 12804.166
133 13442.702 10161.954 17782.628
137 17088.992 12896.470 22644.464
144 8930.717 6874.070 11602.692
150 10464.163 8024.122 13646.192
166 9801.112 7554.124 12716.471
184 9250.621 7123.241 12013.351